# Lab 05: Data Visualization
# ---
#   author: "Jack"
# ---


# Library loading
library(ggplot2)
library(gapminder)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Preview the first six rows of the built-in 'cars' dataset
head(cars, n = 6L)
##   speed dist
## 1     4    2
## 2     4   10
## 3     7    4
## 4     7   22
## 5     8   16
## 6     9   10
# Display stopping distance (ft) as a function of speed (mph) from the 'cars' 
# dataset using ggplot2

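# Option 1 (kept for reference; does not work because title() is a
# base-graphics function, not a ggplot2 component -- use labs() as in Option 2)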
# ggplot(data = cars) +
#   aes(x = speed, y = dist) +
#   geom_point() +
#   title(main = "Stopping Distance of Old Cars") +
#   xlab("Speed (MPH)") +
#   ylab("Stopping Distance (ft)")

# Option 2: scatter plot of the 'cars' data with a fitted linear-regression
# line, labelled via labs()
ggplot(cars, aes(x = speed, y = dist)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(
    x = "Speed (MPH)",
    y = "Stopping Distance (ft)",
    title = "Stopping Distance of Old Cars"
  )

# The same figure drawn with base graphics: filled points (pch = 16) plus the
# least-squares regression line of dist on speed
plot(dist ~ speed,
     data = cars,
     pch = 16,
     main = "Stopping Distance of Old Cars\n[base graphics]",
     xlab = "Speed (MPH)",
     ylab = "Stopping Distance (ft)")
abline(reg = lm(dist ~ speed, data = cars),
       lwd = 2,
       col = "blue")

# Read the tab-delimited gene expression table from its URL and preview the
# first six rows
url <- "https://bioboot.github.io/bimm143_S20/class-material/up_down_expression.txt"
genes <- read.delim(file = url)
head(genes, n = 6L)
##         Gene Condition1 Condition2      State
## 1      A4GNT -3.6808610 -3.4401355 unchanging
## 2       AAAS  4.5479580  4.3864126 unchanging
## 3      AASDH  3.7190695  3.4787276 unchanging
## 4       AATF  5.0784720  5.0151916 unchanging
## 5       AATK  0.4711421  0.5598642 unchanging
## 6 AB015752.4 -3.6808610 -3.5921390 unchanging
# Number of genes (rows) in the table
nrow(x = genes)
## [1] 5196
# Column names, followed by the number of columns
names(genes)
## [1] "Gene"       "Condition1" "Condition2" "State"
ncol(x = genes)
## [1] 4
# How many genes fall into each State category
table(genes[["State"]])
## 
##       down unchanging         up 
##         72       4997        127
# Percentage of all genes whose State is "up", rounded to two decimals
round(table(genes[["State"]])["up"] / nrow(genes) * 100, digits = 2)
##   up 
## 2.44
# Scatter plot of expression under the two conditions, one point per gene,
# colored by the gene's State
p <- ggplot(data = genes) +
  aes(x = Condition1, y = Condition2,
      col = State) +
  geom_point()
p

# Assign colors to states by name rather than by position, so the mapping
# stays correct even if the order of the State values ever changes
# (same colors as before: down = red, unchanging = gray, up = blue)
p <- p +
  scale_color_manual(values = c(down = "red",
                                unchanging = "gray",
                                up = "blue"))
p

# Add a title and descriptive axis labels
p <- p +
  labs(title = "Gene Expression Changes Upon Drug Treatment",
       x = "Control (no drug)",
       y = "Drug Treatment")
p

# gapminder dataset: keep only the observations recorded for 2007
gapminder_2007 <- filter(gapminder, year == 2007)

# Life expectancy by year: one violin per year with the median marked,
# overlaid with jittered observations colored by continent
p <- ggplot(gapminder, aes(x = year, y = lifeExp)) +
  geom_violin(mapping = aes(group = year), draw_quantiles = 0.5) +
  geom_jitter(mapping = aes(col = continent), width = 0.3, alpha = 0.4)
p

# Interactive (plotly) version of the same figure
ggplotly(p)
# 2007 only: life expectancy against GDP per capita, with color showing
# continent and point size showing population
p <- ggplot(gapminder_2007,
            aes(x = gdpPercap, y = lifeExp, col = continent, size = pop)) +
  geom_point(alpha = 0.4)
p

# Less readable alternative: population mapped to point color instead of size
q <- ggplot(gapminder_2007, aes(x = gdpPercap, y = lifeExp, col = pop)) +
  geom_point()
q

# Point size mapped to population, using the default size scale
p <- ggplot(gapminder_2007, aes(x = gdpPercap, y = lifeExp, size = pop)) +
  geom_point(alpha = 0.4)
p

# Switch to scale_size_area() so that point area is proportional to
# population (a value of 0 maps to a size of 0)
p <- p + scale_size_area()
p

# Exploring 1957 gapminder data
gapminder_1957 <- filter(gapminder, year == 1957)

# Bubble chart for 1957; max_size = 15 raises the size of the largest points
p <- ggplot(gapminder_1957,
            aes(x = gdpPercap, y = lifeExp, col = continent, size = pop)) +
  geom_point(alpha = 0.7) +
  scale_size_area(max_size = 15)
p

# Keep both years so they can be shown side by side, one facet per year
gapminder_1957_2007 <- filter(gapminder, year %in% c(1957, 2007))

p <- ggplot(gapminder_1957_2007,
            aes(x = gdpPercap, y = lifeExp, col = continent, size = pop)) +
  geom_point(alpha = 0.7) +
  scale_size_area(max_size = 15) +
  facet_wrap(~ year)
p

# Bar charts
# The five most populous countries in 2007, largest first.
# slice_max() selects the top rows and returns them in descending order in
# one step; it replaces the superseded arrange(desc()) + top_n() combination
# and, like top_n(), keeps ties by default.
gapminder_2007_top5 <- gapminder %>%
  filter(year == 2007) %>%
  slice_max(pop, n = 5)

gapminder_2007_top5
## # A tibble: 5 x 6
##   country       continent  year lifeExp        pop gdpPercap
##   <fct>         <fct>     <int>   <dbl>      <int>     <dbl>
## 1 China         Asia       2007    73.0 1318683096     4959.
## 2 India         Asia       2007    64.7 1110396331     2452.
## 3 United States Americas   2007    78.2  301139947    42952.
## 4 Indonesia     Asia       2007    70.6  223547000     3541.
## 5 Brazil        Americas   2007    72.4  190010647     9066.
# Bar chart of population for the five most populous countries of 2007
ggplot(gapminder_2007_top5, aes(x = country, y = pop)) +
  geom_col()